16 Samples from two putative native trees collected at two National Trust Sites; Sheringham Estate (n = 13), Felbrigg Estate (n = 3) were genotyped and analysed alongside 194 autochthonous samples from sites across Scotland. Principal Component analysis was performed to detect any underlying population structure. Additional Admixture analysis was performed using the LEA software to identify the ancestry components. The samples were also analysed against European samples using the same methods.
# Read in TASSEL 5 PCA values in Plink format from a text file into a data frame
df <- read.table("PC_newLEAFgenotypes_Mothers+NationalTrustSamples.txt", header = TRUE, sep = "\t")
# Read a Keyfile containing Taxa names and corresponding information of interest into a data frame
Keyfile <- read.table("newLEAFgenotypes_Mothers+NationalTrustSamples_Keyfile.txt", header = TRUE, sep = "\t")
# Merge the two data frames based on specific columns
merged_data <- merge(df, Keyfile, by.x = "FID", by.y = "Taxa", row.names = FALSE)
merged_data$Classification <- factor(merged_data$Classification, levels=c("Cairngorms", "Central Atlantic", "Hyper-oceanic", "Sheringham Estate", "Felbrigg Estate"))
NationalTrust_data <- subset(merged_data, Classification %in% c("Sheringham Estate", "Felbrigg Estate"))
eigen_percent <- read.table("Eigenvalues_newLEAFgenotypes_Mothers+NationalTrustSamples.txt", row.names = 1, header = TRUE, sep = "\t")
eigen_percent$proportion.of.total <- eigen_percent$proportion.of.total * 100
eigen_percent <- eigen_percent[ ,("proportion.of.total")]
Principal Component Analysis (PCA) of genetic data simplifies the complex genetic information of individuals into key patterns, helping us visualize and understand the underlying genetic relationships within a population.
# Create a ggplot2 plot
ClassificationPC <- ggplot(merged_data, aes(x = PC1, y = PC2, color = Classification)) +
geom_point() +
labs(x = paste0("Principal component 1 (",eigen_percent[2]," %)"),
y = paste0("Principal component 2 (",eigen_percent[3]," %)")) +
scale_color_manual(values = c("#b69140",
"#8d70c9",
"#6aa74d",
"#c8588c",
"#49adad")) +
stat_ellipse(data = NationalTrust_data, aes(group = NULL), color="red", type = "norm") +
labs(color = "Classification") # Change the legend title to "Classification"
# Convert the ggplot2 plot to an interactive Plotly plot
interactive_plotClassificationPC <- ggplotly(ClassificationPC, originalData = FALSE) # Set originalData to FALSE
# Display the interactive plot
interactive_plotClassificationPC
Figure 1: Eco-Geographic Diversity among Native Scots Pine Trees in the UK. This Principal Component Analysis (PCA) illustrates the genetic relationships among 210 Scots pine trees from the UK inferred from 31556 SNPs. Each dot on the plot represents an individual tree, coloured by their eco-geographic classification. Putative native populations (n = 13) are labelled accordingly. The distance between dots reflects the degree of genetic difference or similarity, whereas closer dots represent closer genetic relationships. The ‘Explained Variance’ score quantifies how effectively the PCA simplifies the genetic information. Putative amples highlighted by red ellipse.
SourcePC <- ggplot(merged_data, aes(x = PC1, y = PC2, color = Source)) +
geom_point() +
labs(x = paste0("Principal component 1 (",eigen_percent[2]," %)"),
y = paste0("Principal component 2 (",eigen_percent[3]," %)")) +
scale_color_manual(values = c("#b1346b",
"#7cb749",
"#5855ba",
"#baa93f",
"#6a83e8",
"#c7792a",
"#56a0e4",
"#d15543",
"#43c8ac",
"#b0429b",
"#59be79",
"#b370d6",
"#548030",
"#533080",
"#adae5f",
"#4e62ab",
"#af783d",
"#a987d5",
"#993f2d",
"#d880cc",
"#d0555e",
"#7f2960",
"#e172a3",
"#b9475f")) +
stat_ellipse(data = NationalTrust_data, aes(group = NULL), color="red", type= "norm") +
labs(color = "Source") # Change the legend title to "Source"
# Convert the ggplot2 plot to an interactive Plotly plot
interactive_plotSourcePC <- ggplotly(SourcePC, originalData = FALSE) # Set originalData to FALSE
# Display the interactive plot
interactive_plotSourcePC
Figure 2: Provenance Variation in Native Scots Pine Trees across the UK. This Principal Component Analysis (PCA) illustrates the genetic relationships among 210 Scots pine trees from the UK inferred from 31556 SNPs. Each dot on the plot represents an individual tree, coloured by their source population. The distance between dots reflects the degree of genetic difference or similarity, whereas closer dots represent closer genetic relationships. The ‘Explained Variance’ score quantifies how effectively the PCA simplifies the genetic information. Putative amples highlighted by red ellipse.
An additional 16 samples from putative Native Irish trees were included within the PCA analysis; Dale Wood in County Kerry and Rockforest in County Clare.
# Read in TASSEL 5 PCA values in Plink format from a text file into a data frame
df <- read.table("PC_newLEAFgenotypes_Mothers+NationalTrustSamples+Irish.txt", header = TRUE, sep = "\t")
# Read a Keyfile containing Taxa names and corresponding information of interest into a data frame
Keyfile <- read.table("newLEAFgenotypes_Mothers+NationalTrustSamples_Keyfile.txt", header = TRUE, sep = "\t")
# Merge the two data frames based on specific columns
merged_data <- merge(df, Keyfile, by.x = "FID", by.y = "Taxa", row.names = FALSE)
merged_data$Source <- factor(merged_data$Source, levels=c("Dale Wood", "Rockforest", "Shieldaig", "Loch Clair", "Beinn Eighe", "Cona Glen", "Glen Loy", "Glen Affric", "Rhidorroch", "Glen Cannich", "Glen Einig", "Coille Coire Chuilc", "Crannach", "Alltnacaillich", "Strath Oykel", "Amat", "Meggernie", "Black Wood", "Rothiemurchus", "Abernethy", "Glen Derry", "Allt Cul", "Ballochbuie", "Glen Tanar", "Sheringham Estate", "Felbrigg Estate"))
NationalTrust_data <- subset(merged_data, Source %in% c("Sheringham Estate", "Felbrigg Estate"))
eigen_percent <- read.table("Eigenvalues_newLEAFgenotypes_Mothers+NationalTrustSamples+Irish.txt", row.names = 1, header = TRUE, sep = "\t")
eigen_percent$proportion.of.total <- eigen_percent$proportion.of.total * 100
eigen_percent <- eigen_percent[ ,("proportion.of.total")]
# Create a ggplot2 plot
PC <- ggplot(merged_data, aes(x = PC1, y = PC2, color = Source)) +
geom_point() +
labs(x = paste0("Principal component 1 (",eigen_percent[2]," %)"),
y = paste0("Principal component 2 (",eigen_percent[3]," %)")) +
scale_color_manual(values = c("#ce5183",
"#5ec671",
"#b858b8",
"#9ab03e",
"#5a57bc",
"#d09538",
"#6f81e7",
"#629142",
"#543382",
"#45c097",
"#8a2a66",
"#a0803a",
"#628dd4",
"#bb5338",
"#ba84d5",
"#ba4758",
"#d976b6")) +
stat_ellipse(data = NationalTrust_data, aes(group = NULL), color="red", type = "norm") +
labs(color = "Source") # Change the legend title to "Classification"
interactive_plotPC <- ggplotly(PC, originalData = FALSE) # Set originalData to FALSE
PC2 <- ggplot(merged_data, aes(x = PC2, y = PC3, color = Source)) +
geom_point() +
labs(x = paste0("Principal component 2 (",eigen_percent[3]," %)"),
y = paste0("Principal component 3 (",eigen_percent[4]," %)")) +
scale_color_manual(values = c("#ce5183",
"#5ec671",
"#b858b8",
"#9ab03e",
"#5a57bc",
"#d09538",
"#6f81e7",
"#629142",
"#543382",
"#45c097",
"#8a2a66",
"#a0803a",
"#628dd4",
"#bb5338",
"#ba84d5",
"#ba4758",
"#d976b6")) +
stat_ellipse(data = NationalTrust_data, aes(group = NULL), color="red", type = "norm") +
labs(color = "Source") # Change the legend title to "Classification"
# Convert the ggplot2 plot to an interactive Plotly plot
interactive_plotPC2 <- ggplotly(PC2, originalData = FALSE) # Set originalData to FALSE
subplot(style(interactive_plotPC, showlegend = FALSE), interactive_plotPC2, titleY = TRUE, titleX = TRUE, margin = c(0.01, 0.1, 0, 0), nrows=1) %>%
layout(annotations = list(
list(x = 0, y = 1.05, text = "A", showarrow = F, xref='paper', yref='paper'),
list(x = 0.5, y = 1.05, text = "B", showarrow = F, xref='paper', yref='paper'))
)
Figure 3: Genetic Relationships of Scots Pine Trees of the British Isles. Principal Component Analysis plots (PCA) (A) PC1 vs. PC2. (B) PC2 vs. PC3 showing the genetic relationships among 55 Scots pine trees from the British Isles inferred from 5849 SNPs. Each dot on the plot represents an individual tree, coloured by their source population. The distance between dots reflects the degree of genetic difference or similarity, whereas closer dots represent closer genetic relationships. The “Explained Variance” score quantifies how effectively the PCA simplifies the genetic information. Putative amples highlighted by red ellipse.
Putative native samples were then analysed alongside a data set containing 219 samples from across Europe. The putative native samples clustered alongside the Scottish samples in PCA analysis. ADMIXTURE analysis was also performed with a K=6 having the lowest cross-entropy value.
# Read in TASSEL 5 PCA values in Plink format from a text file into a data frame
df <- read.table("PC_Europe+Scotland_FilteredGenotypes_Proportional_NationalTrustSamples.txt", header = TRUE, sep = "\t")
# Read a Keyfile containing Taxa names and corresponding information of interest into a data frame
Keyfile <- read.table("Keyfile_Europe+Scotland_FilteredGenotypes_Proportional_NationalTrustSamples.txt", header = TRUE, sep = "\t")
# Merge the two data frames based on specific columns
merged_data <- merge(df, Keyfile, by.x = "FID", by.y = "Taxa", row.names = FALSE)
merged_data$Source <- factor(merged_data$Source, levels=c("Ireland", "Scotland", "Sheringham Estate", "Felbrigg Estate", "Spain", "Italy", "Austria", "Poland", "Bulgaria", "Sweden", "Finland", "Siberia", "Turkey"))
NationalTrust_data <- subset(merged_data, Source %in% c("Sheringham Estate", "Felbrigg Estate"))
eigen_percent <- read.table("Eigenvalues_Europe+Scotland_FilteredGenotypes_Proportional_NationalTrustSamples.txt", row.names = 1, header = TRUE, sep = "\t")
eigen_percent$proportion.of.total <- eigen_percent$proportion.of.total * 100
eigen_percent <- eigen_percent[ ,("proportion.of.total")]
# Create a ggplot2 plot
PC <- ggplot(merged_data, aes(x = PC1, y = PC2, color = Source)) +
geom_point() +
labs(x = paste0("Principal component 1 (",eigen_percent[2]," %)"),
y = paste0("Principal component 2 (",eigen_percent[3]," %)")) +
scale_color_manual(values = c("#bb486a",
"#c288d3",
"#8fb745",
"#6d71d8",
"#c49c38",
"#563686",
"#4cc490",
"#be62c1",
"#709243",
"#b14a89",
"#bf6f39",
"#5e8bd5",
"#b64741",
"#bb486a")) +
stat_ellipse(data = NationalTrust_data, aes(group = NULL), color="red", type = "norm") +
labs(color = "Source") # Change the legend title to "Classification"
# Convert the ggplot2 plot to an interactive Plotly plot
interactive_plotPC <- ggplotly(PC, originalData = FALSE) # Set originalData to FALSE
# Display the interactive plot
interactive_plotPC
Figure 3: Genetic Relationships of European Scots Pine Trees. This Principal Component Analysis (PCA) illustrates the genetic relationships among 235 Scots pine trees from across Europe inferred from 5849 SNPs. Each dot on the plot represents an individual tree, coloured by their source population. Autochthonous populations are labelled by their respective country, putative native populations are labelled accordingly. The distance between dots reflects the degree of genetic difference or similarity, whereas closer dots represent closer genetic relationships. The “Explained Variance” score quantifies how effectively the PCA simplifies the genetic information. Putative amples highlighted by red ellipse.
PC2 <- ggplot(merged_data, aes(x = PC2, y = PC3, color = Source)) +
geom_point() +
labs(x = paste0("Principal component 1 (",eigen_percent[2]," %)"),
y = paste0("Principal component 2 (",eigen_percent[3]," %)")) +
scale_color_manual(values = c("#bb486a",
"#c288d3",
"#8fb745",
"#6d71d8",
"#c49c38",
"#563686",
"#4cc490",
"#be62c1",
"#709243",
"#b14a89",
"#bf6f39",
"#5e8bd5",
"#b64741",
"#bb486a")) +
stat_ellipse(data = NationalTrust_data, aes(group = NULL), color="red", type = "norm") +
labs(color = "Source") # Change the legend title to "Classification"
# Convert the ggplot2 plot to an interactive Plotly plot
interactive_plotPC2 <- ggplotly(PC2, originalData = FALSE) # Set originalData to FALSE
# Display the interactive plot
interactive_plotPC2
Figure 3: Genetic Relationships of European Scots Pine Trees. This Principal Component Analysis (PCA) illustrates the genetic relationships among 235 Scots pine trees from across Europe inferred from 5849 SNPs. Each dot on the plot represents an individual tree, coloured by their source population. Autochthonous populations are labelled by their respective country, putative native populations are labelled accordingly. The distance between dots reflects the degree of genetic difference or similarity, whereas closer dots represent closer genetic relationships. The “Explained Variance” score quantifies how effectively the PCA simplifies the genetic information. Putative amples highlighted by red ellipse.
library("cowplot")
plot_grid(PC, PC2,
labels = c("A", "B"),
ncol = 2, nrow = 1)
Figure 3: Genetic Relationships of European Scots Pine Trees. This Principal Component Analysis (PCA) illustrates the genetic relationships among 235 Scots pine trees from across Europe inferred from 5849 SNPs. Each dot on the plot represents an individual tree, coloured by their source population. Autochthonous populations are labelled by their respective country, putative native populations are labelled accordingly. The distance between dots reflects the degree of genetic difference or similarity, whereas closer dots represent closer genetic relationships. The “Explained Variance” score quantifies how effectively the PCA simplifies the genetic information. Putative amples highlighted by red ellipse.
prow <- plot_grid(
PC + theme(legend.position="none"),
PC2 + theme(legend.position="none"),
align = 'vh',
labels = c("A", "B"),
hjust = -1,
nrow = 1
)
prow
Figure 3: Genetic Relationships of European Scots Pine Trees. This Principal Component Analysis (PCA) illustrates the genetic relationships among 235 Scots pine trees from across Europe inferred from 5849 SNPs. Each dot on the plot represents an individual tree, coloured by their source population. Autochthonous populations are labelled by their respective country, putative native populations are labelled accordingly. The distance between dots reflects the degree of genetic difference or similarity, whereas closer dots represent closer genetic relationships. The “Explained Variance” score quantifies how effectively the PCA simplifies the genetic information. Putative amples highlighted by red ellipse.
# extract the legend from one of the plots
legend <- get_legend(
# create some space to the left of the legend
PC + theme(legend.box.margin = margin(0, 0, 0, 12))
)
# add the legend to the row we made earlier. Give it one-third of
# the width of one plot (via rel_widths).
PCgrid <- plot_grid(prow, legend, rel_widths = c(3, 0.8))
ADMIXTURE is a computational method that analyzes genetic data to uncover population substructure and individual ancestry by estimating the proportions of ancestry from different source populations. The optimal number of ancestral populations is infered by the cross-entropy plot.
project <- load.snmfProject("Europe+Scotland_FilteredGenotypes_Proportional_NationalTrustSamples.snmfProject")
# Plot the cross-entropy criterion for all runs in the SNMF project
plot(project, col = "blue", pch = 19, cex = 1.2)
Figure 4; Cross-entropy plot for the number of cluster K = 1-12. The retained value of K is K = 6.
opt.K <- 6
# Select the best run for K = 6 clusters
best = which.min(cross.entropy(project, K = 6))
# Read population data from a file
Populations_data <- read.table("geno_poplist_Europe+Scotland_FilteredGenotypes_Proportional_NationalTrustSamples.txt", header = TRUE, sep = "\t")
# Read the desired order from a file
desired_order <- read.csv("desired_order_Europe+Scotland_FilteredGenotypes_Proportional_NationalTrustSamples.txt", header = FALSE)[, 1]
# Calculate the Q-matrix for the best run
qmatrix <- LEA::Q(project, K = 6, run = best)
qmatrix <- as.data.frame(qmatrix)
# Combine population data and Q-matrix, and sort by desired order
tbl <- cbind(Populations_data, qmatrix)
tbl <- tbl[order(match(tbl$taxa, desired_order)), ]
# Define colors for the barplot
col = c("#50ac72",
"#c45ca2",
"#929d3d",
"#7879cd",
"#c8803e",
"#cc5452")
# Find breakpoints between different populations
breaks <- 0
for (i in 2:nrow(tbl)) {
if (tbl$pop[i] != tbl$pop[i - 1]) {
breaks <- c(breaks, i - 1)
}
}
breaks <- c(breaks, nrow(tbl))
# Calculate spaces and labels for the barplot
spaces <- rep(0, nrow(tbl))
spaces[breaks[2:(length(breaks) - 1)] + 1] <- 2
labels <- rep("", nrow(tbl))
labels[breaks[1:(length(breaks) - 1)] + ((breaks[2:length(breaks)] - breaks[1:(length(breaks) - 1)]) / 2)] <- gsub("_", " ", tbl$pop[breaks])
# Set plot parameters
par(oma = c(4.5, 1, 1, 1), mar = c(3, 3, 2, 1), mgp = c(2, 1, 0), xpd = TRUE)
# Create the barplot
barplot(t(as.matrix(tbl[, 3:8])), col = col, xaxt = "n", border = NA, xlim = c(0, nrow(tbl)), width = 1, space = spaces, names.arg = rep("", nrow(tbl)), ylab="Ancestry proportions", main=paste("K = ",opt.K,sep=""), cex.lab = 1)
# Add x-axis labels with line breaks
axis(1, line = -0.2, at = breaks[1:(length(breaks) - 1)] + ((0:(length(breaks) - 2)) * 2) + ((breaks[2:length(breaks)] - breaks[1:(length(breaks) - 1)]) / 2), lwd = 0, lwd.ticks = 1, labels = gsub("_", " ", tbl$pop[breaks]), las = 2, cex.axis = 0.8)
Figure 5: Admixture Analysis of European Scots Pine Populations (K = 6). This admixture graph displays the genetic composition of 235 European Scots Pine trees (Pinus sylvestris) using a model with K = 6 ancestral components, inferred from 5849 SNPs. Each vertical bar represents an individual tree or a group of trees, and the colours within the bars represent the proportion of ancestry assigned to each of the six genetic clusters.